3장 - 그래프 인과모델

3.1 인과관계에 대해 생각해보기

인과모델은 변수들 사이의 인과 관계를 방향성 비순환 그래프(Directed Acyclic Graph, DAG)로 표현합니다. 노드는 변수를, 화살표는 직접적인 인과 영향을 나타냅니다.

import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import graphviz as gr

# Styling palettes: grayscale shades, line styles and marker shapes.
# NOTE(review): none of these are used in the visible part of the notebook —
# presumably consumed by plotting cells elsewhere; confirm before removing.
color = "0.3 0.5 0.7 0.9".split()
linestyle = "- -- : -.".split()
marker = list("ovdp")

# Show at most 6 rows whenever a DataFrame is displayed.
pd.options.display.max_rows = 6

# Render graphviz graphs as PNG; the trailing semicolon keeps the notebook
# from echoing the call's return value.
gr.set_default_format("png");

import pandas as pd

# Read the cross-sell e-mail dataset (path is relative to the working
# directory) and echo it as the cell output.
data = pd.read_csv(filepath_or_buffer="../data/cross_sell_email.csv")
data

	gender	cross_sell_email	age	conversion
0	0	short	15	0
1	1	short	27	0
2	1	long	17	0
...	...	...	...	...
320	0	no_email	15	0
321	1	no_email	16	0
322	1	long	24	1

323 rows × 4 columns

3.1.1 인과관계 시각화

인과 그래프를 사용하면 복잡한 변수 간의 관계를 시각적으로 명확하게 파악할 수 있습니다.

import graphviz as gr

# Causal graph for the cross-sell e-mail data.
g_cross_sell = gr.Digraph()

# "U" points at conversion and at both covariates.
for child in ("conversion", "age", "gender"):
    g_cross_sell.edge("U", child)

# "rnd" drives the e-mail assignment; the e-mail and both covariates each
# point at conversion.
g_cross_sell.edge("rnd", "cross_sell_email")
for parent in ("cross_sell_email", "age", "gender"):
    g_cross_sell.edge(parent, "conversion")

g_cross_sell

# Rebuild the cross-sell causal graph from an explicit (tail, head) edge list.
g_cross_sell = gr.Digraph()

for tail, head in (
    ("U", "conversion"),
    ("U", "age"),
    ("U", "gender"),
    ("rnd", "cross_sell_email"),
    ("cross_sell_email", "conversion"),
    ("age", "conversion"),
    ("gender", "conversion"),
):
    g_cross_sell.edge(tail, head)

g_cross_sell

# rankdir="LR" lays the graph out in layers from left to right.
g_cross_sell = gr.Digraph(graph_attr=dict(rankdir="LR"))

# Compact version of the cross-sell graph: the covariates are collapsed
# into a single node "X".
for tail, head in (
    ("U", "conversion"),
    ("U", "X"),
    ("cross_sell_email", "conversion"),
    ("X", "conversion"),
):
    g_cross_sell.edge(tail, head)

g_cross_sell

# Left-to-right cross-sell graph with the covariates collapsed into "X".
g_cross_sell = gr.Digraph(graph_attr=dict(rankdir="LR"))

# Edges leaving "U" first, then the edges arriving at "conversion".
for child in ("conversion", "X"):
    g_cross_sell.edge("U", child)
for parent in ("cross_sell_email", "X"):
    g_cross_sell.edge(parent, "conversion")

g_cross_sell

3.1.2 컨설턴트 영입 여부 결정하기

컨설턴트 영입이 이윤에 미치는 영향을 파악하기 위해 그래프로 모델링해 봅니다.

3.2 그래프 모델 집중 훈련

3.2.1 사슬

g = gr.Digraph(graph_attr=dict(rankdir="LR"))

# Abstract chain: T -> M -> Y, with M declared explicitly (label "M").
for tail, head in (("T", "M"), ("M", "Y")):
    g.edge(tail, head)
g.node("M", label="M")

# The same chain shape with concrete variable names.
for tail, head in (
    ("causal knowledge", "solve problems"),
    ("solve problems", "job promotion"),
):
    g.edge(tail, head)

g

g = gr.Digraph(graph_attr=dict(rankdir="LR"))

# Abstract chain T -> M -> Y; the middle node M is drawn filled in light grey.
for tail, head in (("T", "M"), ("M", "Y")):
    g.edge(tail, head)
g.node("M", label="M")
g.node("M", style="filled", color="lightgrey")

# Concrete chain; its middle node gets the same grey fill.
for tail, head in (
    ("causal knowledge", "solve problems"),
    ("solve problems", "job promotion"),
):
    g.edge(tail, head)
g.node("solve problems", style="filled", color="lightgrey")

g

3.2.2 분기

g = gr.Digraph()

# Abstract fork: X points at both Y and T.
for child in ("Y", "T"):
    g.edge("X", child)
g.node("X", label="X")

# Concrete fork: "statistics" is the common parent.
for child in ("causal inference", "machine learning"):
    g.edge("statistics", child)

g

# Fork example: "good programmer" is the common parent of both outcomes.
g = gr.Digraph()

for child in ("can invert a binary tree", "good employee"):
    g.edge("good programmer", child)

g

3.2.3 충돌부

g = gr.Digraph()

# Abstract collider: Y and T both point at X.
for parent in ("Y", "T"):
    g.edge(parent, "X")

# Concrete collider: two independent routes into "job promotion".
for parent in ("statistics", "flatter"):
    g.edge(parent, "job promotion")

g

g = gr.Digraph()

# Abstract collider X1 (parents Y and T) with a child X2 drawn in grey.
for parent in ("Y", "T"):
    g.edge(parent, "X1")
g.edge("X1", "X2")
g.node("X2", style="filled", color="lightgrey")

# Concrete version: "job promotion" is the collider and its child
# "high salary" is the grey node.
for parent in ("statistics", "flatter"):
    g.edge(parent, "job promotion")
g.edge("job promotion", "high salary")

g.node("high salary", style="filled", color="lightgrey")

g

3.2.4 연관관계 흐름 커닝 페이퍼

3.2.5 파이썬에서 그래프 쿼리하기

# Example DAG used for the d-separation queries, drawn left to right.
g = gr.Digraph(graph_attr=dict(rankdir="LR"))
for tail, head in (
    ("C", "A"),
    ("C", "B"),
    ("D", "A"),
    ("B", "E"),
    ("F", "E"),
    ("A", "G"),
):
    g.edge(tail, head)

g

import networkx as nx

# The example DAG as a NetworkX DiGraph so it can be queried.
model = nx.DiGraph(
    [
        ("C", "A"),
        ("C", "B"),
        ("D", "A"),
        ("B", "E"),
        ("F", "E"),
        ("A", "G"),
    ]
)

# Two node sets are dependent exactly when they are NOT d-separated by the
# conditioning set. An empty conditioning set must be written set(): the
# literal {} is an empty dict, not a set.
# NOTE(review): newer NetworkX releases deprecate d_separated in favor of
# is_d_separator — confirm against the installed version.
print("Are D and C dependent?")
print(not nx.d_separated(model, {"D"}, {"C"}, set()))

print("Are D and C dependent given A?")
print(not nx.d_separated(model, {"D"}, {"C"}, {"A"}))

print("Are D and C dependent given G?")
print(not nx.d_separated(model, {"D"}, {"C"}, {"G"}))

Are D and C dependent?
False
Are D and C dependent given A?
True
Are D and C dependent given G?
True

# Dependence between G and D, first unconditionally and then given A.
# The empty conditioning set is set(); the literal {} would be an empty dict.
print("Are G and D dependent?")
print(not nx.d_separated(model, {"G"}, {"D"}, set()))

print("Are G and D dependent given A?")
print(not nx.d_separated(model, {"G"}, {"D"}, {"A"}))

Are G and D dependent?
True
Are G and D dependent given A?
False

# Dependence between A and B, first unconditionally and then given C.
# The empty conditioning set is set(); the literal {} would be an empty dict.
print("Are A and B dependent?")
print(not nx.d_separated(model, {"A"}, {"B"}, set()))

print("Are A and B dependent given C?")
print(not nx.d_separated(model, {"A"}, {"B"}, {"C"}))

Are A and B dependent?
True
Are A and B dependent given C?
False

# Dependence between G and F, first unconditionally and then given E.
# The empty conditioning set is set(); the literal {} would be an empty dict.
print("Are G and F dependent?")
print(not nx.d_separated(model, {"G"}, {"F"}, set()))

print("Are G and F dependent given E?")
print(not nx.d_separated(model, {"G"}, {"F"}, {"E"}))

Are G and F dependent?
False
Are G and F dependent given E?
True

3.3 식별 재해석

# Consultancy graph without a consultancy -> profits_next_6m edge:
# past profits are the only parent of both remaining nodes.
consultancy_sev = gr.Digraph(graph_attr=dict(rankdir="LR"))
for child in ("profits_next_6m", "consultancy"):
    consultancy_sev.edge("profits_prev_6m", child)

consultancy_sev

# NetworkX version of the consultancy graph with the causal edge
# consultancy -> profits_next_6m deliberately left out.
consultancy_model_severed = nx.DiGraph(
    [
        ("profits_prev_6m", "profits_next_6m"),
        ("profits_prev_6m", "consultancy"),
        #     ("consultancy", "profits_next_6m"), # causal relationship removed
    ]
)

# Are treatment and outcome still dependent with nothing conditioned on?
# The empty conditioning set is set(); the literal {} would be an empty dict.
not nx.d_separated(
    consultancy_model_severed, {"consultancy"}, {"profits_next_6m"}, set()
)

True

# Full consultancy graph; the past-profits node is drawn filled in light grey.
g_consultancy = gr.Digraph(graph_attr=dict(rankdir="LR"))
for tail, head in (
    ("profits_prev_6m", "profits_next_6m"),
    ("profits_prev_6m", "consultancy"),
    ("consultancy", "profits_next_6m"),
):
    g_consultancy.edge(tail, head)
g_consultancy.node("profits_prev_6m", style="filled", color="lightgrey")

g_consultancy

3.4 조건부 독립성 가정과 보정 공식

3.5 양수성 가정

3.6 구체적인 식별 예제

# Six-row toy dataset: past profits, consultancy indicator (0/1) and the
# profits observed over the following six months.
df = pd.DataFrame(
    {
        "profits_prev_6m": [1.0, 1.0, 1.0, 5.0, 5.0, 5.0],
        "consultancy": [0, 0, 1, 0, 1, 1],
        "profits_next_6m": [1, 1.1, 1.2, 5.5, 5.7, 5.7],
    }
)

df

	profits_prev_6m	consultancy	profits_next_6m
0	1.0	0	1.0
1	1.0	0	1.1
2	1.0	1	1.2
3	5.0	0	5.5
4	5.0	1	5.7
5	5.0	1	5.7

# Unadjusted comparison: mean outcome of the consultancy==1 rows minus the
# mean outcome of the consultancy==0 rows.
(
    df.loc[df["consultancy"] == 1, "profits_next_6m"].mean()
    - df.loc[df["consultancy"] == 0, "profits_next_6m"].mean()
)

1.666666666666667

# Mean outcome for every (consultancy, past-profits) cell.
avg_df = df.groupby(by=["consultancy", "profits_prev_6m"])["profits_next_6m"].mean()

# Consultancy-vs-no-consultancy difference within each past-profits level.
avg_df.xs(1, level="consultancy") - avg_df.xs(0, level="consultancy")

profits_prev_6m
1.0    0.15
5.0    0.20
Name: profits_next_6m, dtype: float64

# U points at both T and Y, while T reaches Y only through M.
g = gr.Digraph(graph_attr=dict(rankdir="LR"))
for tail, head in (("U", "T"), ("U", "Y"), ("T", "M"), ("M", "Y")):
    g.edge(tail, head)

g

3.7 교란편향

g = gr.Digraph(graph_attr={"rankdir": "LR"})

# Abstract confounding triangle: X points at both T and Y, and T points at Y.
g.edge("X", "T")
g.edge("X", "Y")
g.edge("T", "Y")

# Concrete version with the same shape. These are plain statements; the
# calls are no longer wrapped in throwaway one-element tuples.
g.edge("Manager Quality", "Training")
g.edge("Manager Quality", "Engagement")
g.edge("Training", "Engagement")

g

3.7.1 대리 교란 요인

g = gr.Digraph()

# Abstract graph: U sits between X1 and X2 and also points at T and Y.
for tail, head in (
    ("X1", "U"),
    ("U", "X2"),
    ("U", "T"),
    ("T", "Y"),
    ("U", "Y"),
):
    g.edge(tail, head)

# Concrete graph: nodes downstream and upstream of "Manager Quality".
for tail, head in (
    ("Manager Quality", "Team's Attrition"),
    ("Manager Quality", "Team's Past Performance"),
    ("Manager's Tenure", "Manager Quality"),
    ("Manager's Education Level", "Manager Quality"),
):
    g.edge(tail, head)

# Treatment/outcome triangle around "Manager Quality".
for tail, head in (
    ("Manager Quality", "Training"),
    ("Training", "Engagement"),
    ("Manager Quality", "Engagement"),
):
    g.edge(tail, head)

g

3.7.2 랜덤화 재해석

# "rnd" is the only parent of T; U points at Y but not at T.
g = gr.Digraph(graph_attr=dict(rankdir="LR"))
for tail, head in (("rnd", "T"), ("T", "Y"), ("U", "Y")):
    g.edge(tail, head)

g

3.8 선택편향

3.8.1 충돌부 조건부 설정

g = gr.Digraph(graph_attr={"rankdir": "LR"})

# Abstract graph: S has both T and Y as parents and is drawn in grey.
g.edge("T", "S")
g.edge("T", "Y")
g.edge("Y", "S")
g.node("S", color="lightgrey", style="filled")

# Concrete survey example; "Response" plays the role of S. These are plain
# statements; the calls are no longer wrapped in throwaway one-element tuples.
g.edge("RND", "New Feature")
g.edge("New Feature", "Customer Satisfaction")
g.edge("Customer Satisfaction", "NPS")
g.edge("Customer Satisfaction", "Response")
g.edge("New Feature", "Response")
g.node("Response", "Response", color="lightgrey", style="filled")

g

# Survey graph for NetworkX with the New Feature -> Customer Satisfaction
# edge left out (kept below as a comment for reference).
nps_model = nx.DiGraph()
nps_model.add_edges_from(
    [
        ("RND", "New Feature"),
        #     ("New Feature", "Customer Satisfaction"),
        ("Customer Satisfaction", "NPS"),
        ("Customer Satisfaction", "Response"),
        ("New Feature", "Response"),
    ]
)


# Are NPS and New Feature dependent once Response is conditioned on?
not nx.d_separated(nps_model, {"NPS"}, {"New Feature"}, {"Response"})

True

# Simulate the NPS survey. The seed is fixed, and the order of the random
# draws below must not change, otherwise the simulated tables change too.
np.random.seed(2)
n = 100000
# Treatment indicator: each of the n units gets the feature with probability 0.5.
new_feature = np.random.binomial(1, 0.5, n)

# Potential satisfaction without (_0) and with (_1) the feature: the feature
# adds a constant 0.4. The realized value follows the unit's actual treatment.
satisfaction_0 = np.random.normal(0, 0.5, n)
satisfaction_1 = satisfaction_0 + 0.4
satisfaction = new_feature * satisfaction_1 + (1 - new_feature) * satisfaction_0

# Potential NPS: noisy (sd 1) draws centered on the matching potential
# satisfaction. The realized NPS again follows the actual treatment.
nps_0 = np.random.normal(satisfaction_0, 1)
nps_1 = np.random.normal(satisfaction_1, 1)
nps = new_feature * nps_1 + (1 - new_feature) * nps_0


# A unit responds when a noisy score centered on new_feature + satisfaction
# exceeds 1, so both the treatment and satisfaction raise the response odds.
responded = (np.random.normal(0 + new_feature + satisfaction, 1) > 1).astype(int)

# Full simulated data, including both potential outcomes.
tr_df = pd.DataFrame(
    dict(
        new_feature=new_feature, responded=responded, nps_0=nps_0, nps_1=nps_1, nps=nps
    )
)

# Restricted view: potential outcomes are blanked out and the realized NPS is
# kept only for units that responded.
tr_df_measurable = pd.DataFrame(
    dict(
        new_feature=new_feature,
        responded=responded,
        nps_0=np.nan,
        nps_1=np.nan,
        nps=np.where(responded, nps, np.nan),
    )
)

# Column means of the full simulated data per treatment arm.
tr_df.groupby(by="new_feature").mean()

	responded	nps_0	nps_1	nps
new_feature
0	0.183715	-0.005047	0.395015	-0.005047
1	0.639342	-0.005239	0.401082	0.401082

# Per-arm means of the restricted data, with the nps column overwritten by NaN.
tr_df_measurable.groupby(by="new_feature").mean().assign(nps=np.nan)

	responded	nps_0	nps_1	nps
new_feature
0	0.183715	NaN	NaN	NaN
1	0.639342	NaN	NaN	NaN

# Means of the restricted data per (responded, new_feature) cell.
tr_df_measurable.groupby(by=["responded", "new_feature"]).mean()

		nps_0	nps_1	nps
responded	new_feature
0	0	NaN	NaN	NaN
0	1	NaN	NaN	NaN
1	0	NaN	NaN	0.314073
1	1	NaN	NaN	0.536106

# Means of the full simulated data per (responded, new_feature) cell,
# including both potential-outcome columns.
tr_df.groupby(by=["responded", "new_feature"]).mean()

		nps_0	nps_1	nps
responded	new_feature
0	0	-0.076869	0.320616	-0.076869
0	1	-0.234852	0.161725	0.161725
1	0	0.314073	0.725585	0.314073
1	1	0.124287	0.536106	0.536106

3.8.2 선택편향 보정

g = gr.Digraph()

# Abstract graph: S (grey) has parents X and T; U points at both X and Y.
g.edge("U", "X")
g.edge("X", "S")
g.edge("U", "Y")
g.edge("T", "Y")
g.edge("T", "S")
g.node("S", color="lightgrey", style="filled")

# Concrete survey version with the same shape. These are plain statements;
# the calls are no longer wrapped in throwaway one-element tuples.
g.edge("New Feature", "Customer Satisfaction")
g.edge("Unknown Stuff", "Customer Satisfaction")
g.edge("Unknown Stuff", "Time in App")
g.edge("Time in App", "Response")
g.edge("New Feature", "Response")

g.node("Response", "Response", color="lightgrey", style="filled")

g

g = gr.Digraph(graph_attr=dict(rankdir="LR"))

# Each entry is (tail, head, extra edge attributes); the three edges
# U -> Y, U -> S and T -> S are drawn dashed, all others use the default style.
for tail, head, attrs in (
    ("X1", "U", {}),
    ("U", "X2", {}),
    ("X5", "S", {}),
    ("U", "Y", {"style": "dashed"}),
    ("U", "S", {"style": "dashed"}),
    ("U", "X3", {}),
    ("X3", "S", {}),
    ("Y", "X4", {}),
    ("X4", "S", {}),
    ("T", "X5", {}),
    ("T", "Y", {}),
    ("T", "S", {"style": "dashed"}),
):
    g.edge(tail, head, **attrs)

# S is drawn filled in light grey.
g.node("S", style="filled", color="lightgrey")

g

# X has both Y and T as parents, and T also points at Y.
g = gr.Digraph(graph_attr=dict(rankdir="LR"))

for tail, head in (("Y", "X"), ("T", "X"), ("T", "Y")):
    g.edge(tail, head)

# The trailing semicolon keeps the notebook from rendering the graph here.
g;

3.8.3 매개자 조건부 설정

g = gr.Digraph(graph_attr=dict(rankdir="LR"))

# Abstract graph: T reaches Y directly and through M; M is drawn in grey.
for tail, head in (("T", "M"), ("T", "Y"), ("M", "Y")):
    g.edge(tail, head)
g.node("M", style="filled", color="lightgrey")

# Concrete version: "seniority" takes the place of M.
for tail, head in (
    ("woman", "seniority"),
    ("woman", "salary"),
    ("seniority", "salary"),
):
    g.edge(tail, head)
g.node("seniority", style="filled", color="lightgrey")

g

# T reaches Y directly and through M; X is a child of M and is the grey node.
g = gr.Digraph(graph_attr=dict(rankdir="LR"))
for tail, head in (("T", "M"), ("T", "Y"), ("M", "Y"), ("M", "X")):
    g.edge(tail, head)
g.node("X", style="filled", color="lightgrey")

g

3.9 요약

# Confounding triangle (U -> T, U -> Y, T -> Y), drawn left to right with
# the graph "ratio" attribute set to 0.3.
g = gr.Digraph(graph_attr=dict(rankdir="LR", ratio="0.3"))
for tail, head in (("U", "T"), ("U", "Y"), ("T", "Y")):
    g.edge(tail, head)

g

# T reaches Y directly and through M, and S has both T and Y as parents.
g = gr.Digraph(graph_attr=dict(rankdir="LR"))
for tail, head in (
    ("T", "M"),
    ("M", "Y"),
    ("T", "Y"),
    ("T", "S"),
    ("Y", "S"),
):
    g.edge(tail, head)

# Both M and S are drawn filled in light grey.
for shaded_node in ("M", "S"):
    g.node(shaded_node, style="filled", color="lightgrey")

g

# T points at the purchase amount and at the "purchase > 0" indicator, and
# the amount points at the indicator as well; the indicator is the grey node.
g = gr.Digraph(graph_attr=dict(rankdir="LR"))
for tail, head in (
    ("T", "In-Game Purchase"),
    ("T", "In-Game Purchase > 0"),
    ("In-Game Purchase", "In-Game Purchase > 0"),
):
    g.edge(tail, head)

g.node("In-Game Purchase > 0", style="filled", color="lightgrey")

g

g = gr.Digraph(graph_attr=dict(rankdir="LR"))

# Loan amount feeds the first-year default, and each year's default feeds
# the next year's.
for tail, head in (
    ("loan amount", "Default at yr=1"),
    ("Default at yr=1", "Default at yr=2"),
    ("Default at yr=2", "Default at yr=3"),
):
    g.edge(tail, head)

# U points at every yearly default node.
for default_node in ("Default at yr=1", "Default at yr=2", "Default at yr=3"):
    g.edge("U", default_node)

# The first two default nodes are filled, in different shades of grey.
g.node("Default at yr=1", style="filled", color="lightgrey")
g.node("Default at yr=2", style="filled", color="darkgrey")

g